Set up
Set seed
set.seed(5)
Load libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(Seurat)
## Attaching SeuratObject
library(ggalluvial)
Load wu et all data, normalize and run PCA/UMAP
if(file.exists('analysis_files/s5_so_wu.rds')){
so_wu <- readRDS('analysis_files/s5_so_wu.rds')
}else{
print('Wu et al Seurat object not found, loading from publication files')
# Read counts
wu_10x <- Read10X('/Users/Nick/OneDrive - Oregon Health & Science University/public_data/wu_et_al_2021_nat_gen/data/')
# Read metadata & convert to Seurat accepted format
wu_meta <- read_csv('/Users/Nick/OneDrive - Oregon Health & Science University/public_data/wu_et_al_2021_nat_gen/Whole_miniatlas_meta.csv',
lazy = FALSE)[-1,] %>%
mutate(Percent_mito = as.numeric(Percent_mito)) %>%
mutate(nCount_RNA = as.numeric(nCount_RNA)) %>%
mutate(nFeature_RNA = as.numeric(nFeature_RNA))
wu_meta_df <- as.data.frame(wu_meta)
row.names(wu_meta_df) <- wu_meta_df$NAME
# Read patient metadata and add to wu_meta_df
wu_patient_meta <- read_csv('/Users/Nick/OneDrive - Oregon Health & Science University/public_data/wu_et_al_2021_nat_gen/supplementary_table1.csv',
lazy = FALSE) %>%
janitor::clean_names() %>%
mutate(patient_id = paste0('CID', str_remove(case_id, pattern = '-|A|N'))) %>%
mutate(her2_status = subtype_by_ihc %in% c('HER2+/ER+', 'HER2+')) %>%
mutate(er_status = subtype_by_ihc %in% c('HER2+/ER+', 'ER+'))
# Add ER and HER2 status back to scRNA-seq meta for easier plotting later
wu_meta_df$her2_status <- plyr::mapvalues(x = str_remove(wu_meta_df$Patient, pattern = 'A|N'),
from = wu_patient_meta$patient_id,
to = wu_patient_meta$her2_status)
wu_meta_df$er_status <- plyr::mapvalues(x = str_remove(wu_meta_df$Patient, pattern = 'A|N'),
from = wu_patient_meta$patient_id,
to = wu_patient_meta$er_status)
wu_meta_df$subtype_by_ihc <- plyr::mapvalues(x = str_remove(wu_meta_df$Patient, pattern = 'A|N'),
from = wu_patient_meta$patient_id,
to = wu_patient_meta$subtype_by_ihc)
# Confirm all cell barcodes have meta data
sum(colnames(wu_10x) %in% row.names(wu_meta_df))/length(unique(colnames(wu_10x), row.names(wu_meta_df)))
# Create seurat object
so_wu <- CreateSeuratObject(counts = wu_10x,
meta.data = wu_meta_df,
project = 'wu_2020_natgen')
wu_10x <- wu_meta <- wu_meta_df <- NULL
gc()
## Normalize, find variable features and scale
so_wu <- NormalizeData(so_wu)
so_wu <- FindVariableFeatures(so_wu)
top_vf <- head(VariableFeatures(so_wu), 50)
LabelPoints(plot = VariableFeaturePlot(so_wu),
points = top_vf)
so_wu <- ScaleData(so_wu)
## PCA and UMAP
so_wu <- RunPCA(so_wu,
npcs = 50)
ElbowPlot(so_wu,
ndims = 50)
so_wu <- RunUMAP(so_wu,
dims = 1:20)
saveRDS(so_wu, 'analysis_files/s5_so_wu.rds')
}
Wu et all analysis
Visualize UMAP
DimPlot(so_wu,
raster = TRUE)+
coord_equal()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
DimPlot(so_wu,
raster = TRUE,
group.by = 'celltype_major',
label = TRUE)+
coord_equal()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
DimPlot(so_wu,
raster = TRUE,
group.by = 'normal_cell_call',
label = TRUE)+
coord_equal()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
DimPlot(so_wu,
raster = TRUE,
group.by = 'celltype_minor',
label = TRUE)+
coord_equal()+
theme(legend.position = 'none')
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
DimPlot(so_wu,
raster = TRUE,
group.by = 'celltype_minor',
label = FALSE)+
coord_equal()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
DimPlot(so_wu,
raster = TRUE,
group.by = 'celltype_subset',
label = TRUE)+
coord_equal()+
theme(legend.position = 'none')
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
DimPlot(so_wu,
raster = TRUE,
group.by = 'subtype',
label = TRUE,
shuffle = TRUE)+
coord_equal()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
DimPlot(so_wu,
raster = TRUE,
group.by = 'subtype',
label = TRUE)+
facet_wrap(~subtype)
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
Celltype assignments
Review wu et all celltype labels
so_wu@meta.data %>%
as.tibble() %>%
dplyr::select(celltype_major, celltype_minor, celltype_subset) %>%
distinct() %>%
DT::datatable()
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
so_wu_labels_freq <- so_wu@meta.data %>%
dplyr::select(celltype_major, celltype_minor, celltype_subset) %>%
group_by(celltype_major, celltype_minor, celltype_subset) %>%
summarize(n_cells = n())
## `summarise()` has grouped output by 'celltype_major', 'celltype_minor'. You can
## override using the `.groups` argument.
ggplot(so_wu_labels_freq, aes(y = n_cells, axis1 = celltype_major, axis2 = celltype_minor, axis3 = celltype_subset, fill = celltype_major))+
geom_alluvium()+
geom_stratum()+
geom_label(stat = "stratum", aes(label = after_stat(stratum)))+
theme_minimal()+
scale_x_discrete(limits = c('celltype_major', 'celltype_minor', 'celltype_subset'))+
labs(fill = 'celltype_major')
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
Recode from wu celltype_major to ‘lineage’ equivalent from Myc;Ptenfl analysis
unique(so_wu@meta.data$celltype_major)
## [1] "Endothelial" "CAFs" "PVL"
## [4] "B-cells" "T-cells" "Myeloid"
## [7] "Normal Epithelial" "Plasmablasts" "Cancer Epithelial"
celltype_major_l1_dict <- rbind(c('Endothelial', 'endothelial'),
c('CAFs', 'fibroblast'),
c('PVL', 'perivascular'),
c('B-cells', 'lymphoid'),
c('T-cells', 'lymphoid'),
c('Myeloid', 'myeloid'),
c('Normal Epithelial', 'epithelial'),
c('Plasmablasts', 'lymphoid'),
c('Cancer Epithelial', 'epithelial'))
DT::datatable(celltype_major_l1_dict)
so_wu@meta.data$celltype_l1 <- plyr::mapvalues(x = so_wu@meta.data$celltype_major,
from = celltype_major_l1_dict[,1],
to = celltype_major_l1_dict[,2])
DimPlot(so_wu,
group.by = 'celltype_major',
label = TRUE)+
coord_equal()+
theme(legend.position = 'none')
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
DimPlot(so_wu,
group.by = 'celltype_l1',
label = TRUE)+
theme(legend.position = 'none')
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
Find orthologous features
Load Myc;Ptenfl data
so_merge <- readRDS('analysis_files/s4_so_merge.rds')
Idents(so_merge) <- 'cluster_l2'
scPred: Wu celltype_minor -> mycpten data
Build predictive model
if(file.exists('analysis_files/s5_so_wu_shared_trained.rds')){
print('Classifier already trained, not re-computing')
}else{
print('Training model')
so_wu_shared <- getFeatureSpace(so_wu_shared,
'celltype_minor')
so_wu_shared <- trainModel(so_wu_shared,
model = 'mda')
print('Saving trained model')
saveRDS(so_wu_shared, file = 'analysis_files/s5_so_wu_shared_trained.rds')
# saveRDS(so_wu_shared@misc$scPred, file = 'analysis_files/s5_scpred_wu_shared.rds')
}
## [1] "Classifier already trained, not re-computing"
Look at results
so_wu_shared@misc$scPred
## 'scPred' object
## ✔ Prediction variable = celltype_minor
## ✔ Discriminant features per cell type
## ✔ Training model(s)
## Summary
##
## |Cell type | n| Features|Method | ROC| Sens| Spec|
## |:---------------------------|-----:|--------:|:------|-----:|-----:|-----:|
## |B cells Memory | 2581| 50|mda | 0.990| 0.946| 0.991|
## |B cells Naive | 625| 50|mda | 0.993| 0.958| 0.981|
## |CAFs MSC iCAF-like | 3153| 50|mda | 0.987| 0.903| 0.986|
## |CAFs myCAF-like | 3420| 50|mda | 0.996| 0.858| 0.993|
## |Cancer Basal SC | 4312| 50|mda | 0.967| 0.774| 0.979|
## |Cancer Cycling | 5359| 50|mda | 0.969| 0.774| 0.972|
## |Cancer Her2 SC | 3708| 50|mda | 0.972| 0.763| 0.976|
## |Cancer LumA SC | 7742| 50|mda | 0.988| 0.973| 0.967|
## |Cancer LumB SC | 3368| 50|mda | 0.973| 0.902| 0.957|
## |Cycling PVL | 50| 50|mda | 0.988| 0.760| 0.984|
## |Cycling T-cells | 1528| 50|mda | 0.982| 0.781| 0.992|
## |Cycling_Myeloid | 463| 50|mda | 0.980| 0.758| 0.978|
## |DCs | 955| 50|mda | 0.997| 0.924| 0.991|
## |Endothelial ACKR1 | 4611| 50|mda | 0.999| 0.973| 0.993|
## |Endothelial CXCL12 | 1644| 50|mda | 0.998| 0.941| 0.993|
## |Endothelial Lymphatic LYVE1 | 203| 50|mda | 0.979| 0.808| 0.980|
## |Endothelial RGS5 | 1147| 50|mda | 0.994| 0.939| 0.983|
## |Luminal Progenitors | 1992| 50|mda | 0.999| 0.985| 0.998|
## |Macrophage | 5929| 50|mda | 0.994| 0.917| 0.988|
## |Mature Luminal | 1265| 50|mda | 0.989| 0.795| 0.986|
## |Monocyte | 2328| 50|mda | 0.992| 0.885| 0.983|
## |Myoepithelial | 1098| 50|mda | 1.000| 0.982| 0.999|
## |NK cells | 1846| 50|mda | 0.994| 0.904| 0.986|
## |NKT cells | 1122| 50|mda | 0.993| 0.855| 0.986|
## |Plasmablasts | 3524| 50|mda | 0.998| 0.980| 0.998|
## |PVL Differentiated | 3487| 50|mda | 0.998| 0.926| 0.995|
## |PVL Immature | 1886| 50|mda | 0.996| 0.905| 0.990|
## |T cells CD4+ | 19231| 50|mda | 0.988| 0.941| 0.970|
## |T cells CD8+ | 11487| 50|mda | 0.979| 0.859| 0.968|
get_probabilities(so_wu_shared) %>%
head()
## Endothelial ACKR1 Endothelial RGS5 Endothelial CXCL12
## CID3586_AAGACCTCAGCATGAG 1 2.003150e-07 4.839682e-22
## CID3586_AAGGTTCGTAGTACCT 1 7.397193e-05 2.601051e-24
## CID3586_ACCAGTAGTTGTGGCC 1 7.352694e-09 4.002143e-23
## CID3586_ACCCACTAGATGTCGG 1 7.303812e-01 2.831623e-23
## CID3586_ACTGATGGTCAACTGT 1 2.536747e-02 2.212364e-15
## CID3586_ACTTGTTAGGGAAACA 1 8.911516e-06 3.305400e-30
## CAFs MSC iCAF-like CAFs myCAF-like PVL Differentiated
## CID3586_AAGACCTCAGCATGAG 1.868347e-21 4.099174e-19 4.077561e-38
## CID3586_AAGGTTCGTAGTACCT 8.341724e-09 1.160443e-19 9.414944e-18
## CID3586_ACCAGTAGTTGTGGCC 1.248101e-22 4.813984e-16 9.975417e-21
## CID3586_ACCCACTAGATGTCGG 4.469753e-09 2.785414e-21 1.015892e-24
## CID3586_ACTGATGGTCAACTGT 8.192184e-11 1.000265e-24 1.912015e-33
## CID3586_ACTTGTTAGGGAAACA 3.197197e-10 6.744133e-22 2.050644e-25
## PVL Immature Endothelial Lymphatic LYVE1
## CID3586_AAGACCTCAGCATGAG 1.515444e-29 0.3568192593
## CID3586_AAGGTTCGTAGTACCT 1.845669e-26 0.2676977985
## CID3586_ACCAGTAGTTGTGGCC 8.502484e-29 0.0005419999
## CID3586_ACCCACTAGATGTCGG 1.253373e-22 0.0001489643
## CID3586_ACTGATGGTCAACTGT 3.105859e-25 0.0005213064
## CID3586_ACTTGTTAGGGAAACA 7.892852e-24 0.7960905689
## B cells Memory B cells Naive T cells CD8+ T cells CD4+
## CID3586_AAGACCTCAGCATGAG 1.183670e-07 1.122082e-44 2.144809e-10 7.955333e-06
## CID3586_AAGGTTCGTAGTACCT 5.889650e-05 3.199620e-11 1.705763e-05 3.448587e-06
## CID3586_ACCAGTAGTTGTGGCC 5.350744e-04 1.904061e-11 2.082512e-04 1.032236e-12
## CID3586_ACCCACTAGATGTCGG 4.302226e-08 5.108091e-11 7.562800e-12 2.637520e-05
## CID3586_ACTGATGGTCAACTGT 1.788890e-05 1.343078e-09 8.677196e-09 8.326259e-06
## CID3586_ACTTGTTAGGGAAACA 8.160168e-24 1.225567e-11 4.658081e-05 5.261893e-07
## NK cells Cycling T-cells NKT cells Macrophage
## CID3586_AAGACCTCAGCATGAG 4.160530e-10 1.076226e-10 5.205474e-08 2.152898e-17
## CID3586_AAGGTTCGTAGTACCT 2.852535e-10 6.803017e-08 9.583610e-16 2.592094e-13
## CID3586_ACCAGTAGTTGTGGCC 4.354209e-08 2.053181e-07 3.561144e-08 2.223510e-17
## CID3586_ACCCACTAGATGTCGG 2.761888e-13 8.245699e-12 2.874633e-14 1.859639e-16
## CID3586_ACTGATGGTCAACTGT 8.206602e-15 3.081613e-11 1.756351e-08 7.244806e-14
## CID3586_ACTTGTTAGGGAAACA 2.259881e-10 5.484310e-16 9.693075e-08 2.263377e-19
## Monocyte Cycling_Myeloid DCs
## CID3586_AAGACCTCAGCATGAG 2.097473e-16 3.209078e-21 1.714277e-17
## CID3586_AAGGTTCGTAGTACCT 2.781148e-13 1.889234e-07 1.524376e-15
## CID3586_ACCAGTAGTTGTGGCC 6.713838e-11 1.237469e-09 6.374311e-13
## CID3586_ACCCACTAGATGTCGG 1.487502e-11 3.368482e-08 1.949465e-12
## CID3586_ACTGATGGTCAACTGT 2.172737e-19 7.703806e-16 9.727881e-23
## CID3586_ACTTGTTAGGGAAACA 1.613937e-15 4.253898e-12 1.013405e-13
## Myoepithelial Luminal Progenitors Mature Luminal
## CID3586_AAGACCTCAGCATGAG 2.261935e-55 1.844934e-27 2.347579e-05
## CID3586_AAGGTTCGTAGTACCT 4.061256e-50 2.609801e-25 4.162487e-05
## CID3586_ACCAGTAGTTGTGGCC 1.560566e-55 1.153612e-28 4.298928e-05
## CID3586_ACCCACTAGATGTCGG 3.032393e-58 1.912968e-22 1.790728e-04
## CID3586_ACTGATGGTCAACTGT 1.873096e-50 5.904651e-31 5.638292e-04
## CID3586_ACTTGTTAGGGAAACA 3.115335e-51 1.174433e-28 1.477751e-04
## Plasmablasts Cancer Cycling Cancer Her2 SC
## CID3586_AAGACCTCAGCATGAG 7.399475e-21 4.937141e-06 2.302996e-06
## CID3586_AAGGTTCGTAGTACCT 1.245102e-17 1.497658e-05 6.648358e-07
## CID3586_ACCAGTAGTTGTGGCC 2.661955e-19 6.936914e-06 3.690251e-06
## CID3586_ACCCACTAGATGTCGG 1.802947e-13 3.594268e-06 5.971771e-06
## CID3586_ACTGATGGTCAACTGT 1.398909e-18 8.908713e-07 1.936374e-05
## CID3586_ACTTGTTAGGGAAACA 3.603510e-15 1.516829e-05 1.128390e-07
## Cancer LumB SC Cancer Basal SC Cycling PVL
## CID3586_AAGACCTCAGCATGAG 9.944962e-10 8.567279e-06 5.688735e-13
## CID3586_AAGGTTCGTAGTACCT 4.367787e-08 3.867812e-05 7.328937e-14
## CID3586_ACCAGTAGTTGTGGCC 1.761924e-05 4.478141e-06 1.395569e-27
## CID3586_ACCCACTAGATGTCGG 5.558163e-05 4.676754e-06 3.908383e-14
## CID3586_ACTGATGGTCAACTGT 4.344627e-14 1.446185e-06 4.150082e-13
## CID3586_ACTTGTTAGGGAAACA 4.477743e-15 7.689223e-05 1.803005e-26
## Cancer LumA SC
## CID3586_AAGACCTCAGCATGAG 3.777797e-06
## CID3586_AAGGTTCGTAGTACCT 9.585564e-07
## CID3586_ACCAGTAGTTGTGGCC 1.944880e-06
## CID3586_ACCCACTAGATGTCGG 2.094946e-06
## CID3586_ACTGATGGTCAACTGT 7.478631e-09
## CID3586_ACTTGTTAGGGAAACA 7.570103e-06
plot_probabilities(so_wu_shared)
Convert so_merge to subset of ortholog features
if(file.exists('analysis_files/s5_so_merge_subset_mda_scpred.rds')){
print('s5_so_merge_subset_mda_scpred.rds already exists, not recomputing')
}else{
print('Applying classifier to myc;ptenfl data')
so_merge_subset <- subset(so_merge,
features = mycpten_features$orig.features[!is.na(mycpten_features$converted.features)])
# Rename the features
rownames(so_merge_subset@assays$RNA@counts) <- plyr::mapvalues(x = rownames(so_merge_subset@assays$RNA@counts),
from = mycpten_features$orig.features,
to = mycpten_features$converted.features,
warn_missing = FALSE)
rownames(so_merge_subset@assays$RNA@data) <- plyr::mapvalues(x = rownames(so_merge_subset@assays$RNA@data),
from = mycpten_features$orig.features,
to = mycpten_features$converted.features,
warn_missing = FALSE)
# Find new variable features and rescale
so_merge_subset <- scPredict(so_merge_subset, so_wu_shared)
# Visualize assigned label
DimPlot(so_merge_subset,
group.by = 'scpred_prediction')+
coord_equal()
# Celltype_full vs assigned label
freq <- so_merge_subset@meta.data %>%
dplyr::select(celltype_full, scpred_prediction) %>%
table()
nmi <- aricode::NMI(c1 = so_merge_subset@meta.data$celltype_full,
c2 = so_merge_subset@meta.data$scpred_prediction)
ari <- aricode::ARI(c1 = so_merge_subset@meta.data$celltype_full,
c2 = so_merge_subset@meta.data$scpred_prediction)
pheatmap::pheatmap(prop.table(freq, margin = 1),
display_numbers = freq,
main = paste0('NMI: ', nmi, '\n ARI: ', ari))
saveRDS(so_merge_subset, file = 'analysis_files/s5_so_merge_subset_mda_scpred.rds')
}
## [1] "s5_so_merge_subset_mda_scpred.rds already exists, not recomputing"
Split by celltype_l1
# heatmap split by lineage
for(i in c('epithelial', 'fibroblast', 'myeloid', 'lymphoid')){
curr_subset <- subset(so_merge_subset, subset = lineage == i)
curr_freq <- curr_subset@meta.data %>%
dplyr::select(celltype_full, scpred_prediction) %>%
table()
labels <- curr_subset@meta.data %>%
dplyr::select(celltype_full, scpred_prediction)
# filter empty rows/columns
curr_freq <- curr_freq[rowSums(curr_freq) != 0, colSums(curr_freq) != 0]
p1 <- pheatmap::pheatmap(prop.table(curr_freq, margin = 1),
display_numbers = round(prop.table(curr_freq, margin = 1), 2),
main = i)
print(p1)
}
Cross-species integration with UINMF via RLiger
convert from mouse gene to human gene via known orthologs.
library(rliger)
## Loading required package: cowplot
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: patchwork
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
##
## align_plots
library(SeuratWrappers)
if(file.exists('analysis_files/s5_xspecies_uinmf_so.rds')){
print('Loading saved UINMF integrated .rds')
species.liger <- readRDS('analysis_files/s5_xspecies_uinmf_rliger.rds')
so_rliger <- readRDS('analysis_files/s5_xspecies_uinmf_so.rds')
}else{
print('Did not find saved Seurat integrated data, processing new one')
#Remove files from classifier
rm(so_wu_shared)
rm(so_merge_subset)
# Pull metadata
wu_meta <- so_wu@meta.data
mycpten_meta <- so_merge@meta.data
# pull counts
wu_counts <- so_wu@assays$RNA@counts
mycpten_counts <- so_merge@assays$RNA@counts
# Remove unneeded seurat objects to make room for rliger object
rm(so_wu)
rm(so_merge)
gc()
# Convert row names to human ortholog when possible
rownames(mycpten_counts) <- mycpten_features$new.features
tibble(n_mouse_features = nrow(mycpten_counts),
n_human_features = nrow(wu_counts),
n_shared_features = length(intersect(rownames(wu_counts), rownames(mycpten_counts)))) %>%
knitr::kable()
# Add ortholog filtered count
# Combine into cross-species liger object
species.liger <- createLiger(list(mouse = mycpten_counts,
human = wu_counts),
take.gene.union = FALSE)
}
## [1] "Loading saved UINMF integrated .rds"
Normalize, select genes, scale and perform joint matrix factorization
if(!file.exists('analysis_files/s5_xspecies_uinmf_so.rds')){
species.liger <- normalize(species.liger)
species.liger <- selectGenes(species.liger,
var.thres= 0.3,
unshared = TRUE,
unshared.datasets = list(1,2),
unshared.thresh = 0.3)
species.liger <- scaleNotCenter(species.liger)
species.liger <- optimizeALS(species.liger,
lambda = 5,
use.unshared = TRUE,
thresh = 1e-10,
k = 50,
nrep = 5)
species.liger <- quantile_norm(species.liger,
ref_dataset = "human")
}
Quantile Norm, clustering and process UMAP
if(!file.exists('analysis_files/s5_xspecies_uinmf_so.rds')){
# delete below
species.liger <- optimizeALS(species.liger,
lambda = 5,
use.unshared = TRUE,
thresh = 1e-10,
k = 50,
nrep = 5)
species.liger <- quantile_norm(species.liger,
ref_dataset = "human")
# Delete above
species.liger <- louvainCluster(species.liger)
species.liger <- runUMAP(species.liger)
saveRDS(species.liger, 'analysis_files/s5_xspecies_uinmf_rliger.rds')
# Convert to seurat object and save
# Convert to seurat object and run umap + find neighbors + unoptimized clustering
so_rliger <- ligerToSeurat(species.liger)
so_rliger <- RunUMAP(so_rliger, reduction = 'inmf', dims = 1:50)
so_rliger <- FindNeighbors(so_rliger,
reduction = 'inmf',
dims = 1:50)
so_rliger <- FindClusters(so_rliger,
resolution = 0.9,
algorithm = 1) # Using optimized resolution from code below
# Add metadata back from two original seurat objects
wu_meta$barcode <- paste0('human_', rownames(wu_meta))
mycpten_meta$barcode <- paste0('mouse_', rownames(mycpten_meta))
so_rliger_meta <- so_rliger@meta.data
so_rliger_meta$barcode <- rownames(so_rliger_meta)
so_rliger_meta <- full_join(x = so_rliger_meta,
y = wu_meta,
by = 'barcode',
suffix = c('', ''))
so_rliger_meta <- left_join(x = so_rliger_meta,
y = mycpten_meta,
by = 'barcode',
suffix = c('', ''))
so_rliger_meta$species <- str_split(so_rliger_meta$barcode, pattern = '_', simplify = TRUE)[,1]
so_rliger@meta.data <- so_rliger_meta
rownames(so_rliger@meta.data) <- so_rliger_meta$barcode
}
Optimize silhouette width on integrated seurat object NOTE: can’t run leiden algorithm, reverting to louvain (leiden requires conversion to dense matrix format)
Prepare silhoutte scoring function
library(bluster)
cluster_sweep <- function(resolution = seq(from = 0.3, to = 0.8, by = 0.1), seurat_object, embedding, ndims = ncol(emedding)){
for(i in resolution){
seurat_object <- FindClusters(seurat_object,
resolution = i,
algorithm = 1)
p1 <- DimPlot(seurat_object,
group.by = 'seurat_clusters',
label = TRUE)+
coord_equal()+
ggtitle(paste0('Resolution: ', resolution))
print(p1)
curr_clusters <- seurat_object@meta.data$seurat_clusters
sil <- approxSilhouette(x = embedding,
clusters = curr_clusters)
boxplot(split(sil$width, curr_clusters),
main = paste0('Resolution: ', i, '\n Mean sil.width: ', mean(sil$width)))
best.choice <- ifelse(sil$width > 0,
curr_clusters,
sil$other)
table(Assigned=curr_clusters, Closest=best.choice)
#
rmsd <- clusterRMSD(embedding, curr_clusters)
barplot(rmsd,
main = paste0('Resolution: ', i, '\n Mean rmsd: ', mean(rmsd)))
if(i == resolution[1]){
qc <- tibble(res = i,
n_clusters = length(unique(sil$cluster)),
mean_sil_width = mean(sil$width),
mean_rmsd = mean(rmsd))
}else{
qc <- rbind(qc,
tibble(res = i,
n_clusters = length(unique(sil$cluster)),
mean_sil_width = mean(sil$width),
mean_rmsd = mean(rmsd)))
}
}
return(qc)
}
Find best silhouette width
cluster_qc <- cluster_sweep(seurat_object = so_rliger,
embedding = so_rliger@reductions$inmf@cell.embeddings,
resolution = seq(from = 0.3, to = 1.2, by = 0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 114106
## Number of edges: 4243103
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9660
## Number of communities: 26
## Elapsed time: 64 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 114106
## Number of edges: 4243103
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9602
## Number of communities: 30
## Elapsed time: 59 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 114106
## Number of edges: 4243103
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9552
## Number of communities: 31
## Elapsed time: 58 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 114106
## Number of edges: 4243103
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9507
## Number of communities: 37
## Elapsed time: 59 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 114106
## Number of edges: 4243103
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9467
## Number of communities: 39
## Elapsed time: 64 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 114106
## Number of edges: 4243103
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9434
## Number of communities: 42
## Elapsed time: 58 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 114106
## Number of edges: 4243103
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9400
## Number of communities: 43
## Elapsed time: 67 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 114106
## Number of edges: 4243103
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9367
## Number of communities: 44
## Elapsed time: 69 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 114106
## Number of edges: 4243103
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9333
## Number of communities: 45
## Elapsed time: 59 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 114106
## Number of edges: 4243103
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9300
## Number of communities: 47
## Elapsed time: 59 seconds
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
cluster_qc_gathered <- cluster_qc %>%
gather(- c(res), value = 'value', key = 'metric')
ggplot(cluster_qc_gathered, aes(x = res, y = value, color = metric))+
geom_point()+
geom_line()+
theme_bw()+
facet_wrap(~metric, ncol = 1, scales = 'free_y')
Cluster Seurat object and save
so_rliger <- FindClusters(so_rliger,
resolution = 0.6,
algorithm = 1)
# Save seurat object as .rds file
saveRDS(so_rliger, 'analysis_files/s5_xspecies_uinmf_so.rds')
sessionInfo()
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bluster_1.6.0 SeuratWrappers_0.3.0 rliger_1.0.0
## [4] patchwork_1.1.2 Matrix_1.5-1 cowplot_1.1.1
## [7] scPred_1.9.2 nichenetr_1.1.0 ggalluvial_0.12.3
## [10] SeuratObject_4.1.3 Seurat_4.3.0 forcats_0.5.2
## [13] stringr_1.5.0 dplyr_1.0.10 purrr_0.3.4
## [16] readr_2.1.3 tidyr_1.2.1 tibble_3.1.8
## [19] ggplot2_3.4.0 tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] scattermore_0.8 ModelMetrics_1.2.2.2 R.methodsS3_1.8.2
## [4] bit64_4.0.5 knitr_1.41 R.utils_2.12.2
## [7] irlba_2.3.5.1 data.table_1.14.6 rpart_4.1.19
## [10] hardhat_1.2.0 doParallel_1.0.17 generics_0.1.3
## [13] BiocGenerics_0.44.0 RANN_2.6.1 proxy_0.4-27
## [16] future_1.29.0 DiagrammeR_1.0.9 bit_4.0.4
## [19] tzdb_0.3.0 spatstat.data_3.0-0 xml2_1.3.3
## [22] lubridate_1.8.0 httpuv_1.6.6 assertthat_0.2.1
## [25] gargle_1.2.1 gower_1.0.0 xfun_0.35
## [28] hms_1.1.2 jquerylib_0.1.4 evaluate_0.18
## [31] promises_1.2.0.1 fansi_1.0.3 caTools_1.18.2
## [34] dbplyr_2.2.1 readxl_1.4.1 igraph_1.3.5
## [37] DBI_1.1.3 htmlwidgets_1.5.4 spatstat.geom_3.0-3
## [40] googledrive_2.0.0 stats4_4.2.2 ellipsis_0.3.2
## [43] riverplot_0.10 crosstalk_1.2.0 backports_1.4.1
## [46] bookdown_0.30 deldir_1.0-6 vctrs_0.5.1
## [49] remotes_2.4.2 ROCR_1.0-11 abind_1.4-5
## [52] caret_6.0-93 cachem_1.0.6 withr_2.5.0
## [55] progressr_0.11.0 checkmate_2.1.0 sctransform_0.3.5
## [58] fdrtool_1.2.17 mclust_6.0.0 goftest_1.2-3
## [61] cluster_2.1.4 lazyeval_0.2.2 crayon_1.5.2
## [64] hdf5r_1.3.7 spatstat.explore_3.0-5 recipes_1.0.3
## [67] pkgconfig_2.0.3 labeling_0.4.2 nlme_3.1-160
## [70] vipor_0.4.5 nnet_7.3-18 rlang_1.0.6
## [73] globals_0.16.2 lifecycle_1.0.3 miniUI_0.1.1.1
## [76] rsvd_1.0.5 modelr_0.1.10 cellranger_1.1.0
## [79] randomForest_4.7-1.1 polyclip_1.10-0 matrixStats_0.63.0
## [82] lmtest_0.9-40 zoo_1.8-11 reprex_2.0.2
## [85] base64enc_0.1-3 beeswarm_0.4.0 ggridges_0.5.4
## [88] GlobalOptions_0.1.2 googlesheets4_1.0.1 png_0.1-7
## [91] viridisLite_0.4.1 rjson_0.2.21 bitops_1.0-7
## [94] R.oo_1.25.0 KernSmooth_2.23-20 visNetwork_2.1.2
## [97] pROC_1.18.0 shape_1.4.6 parallelly_1.32.1
## [100] spatstat.random_3.0-1 jpeg_0.1-9 S4Vectors_0.36.0
## [103] scales_1.2.1 magrittr_2.0.3 plyr_1.8.7
## [106] ica_1.0-3 compiler_4.2.2 RColorBrewer_1.1-3
## [109] clue_0.3-63 fitdistrplus_1.1-8 cli_3.4.0
## [112] listenv_0.8.0 pbapply_1.6-0 htmlTable_2.4.1
## [115] Formula_1.2-4 MASS_7.3-58.1 tidyselect_1.2.0
## [118] stringi_1.7.8 highr_0.9 yaml_2.3.6
## [121] latticeExtra_0.6-30 ggrepel_0.9.2 grid_4.2.2
## [124] sass_0.4.4 tools_4.2.2 future.apply_1.10.0
## [127] parallel_4.2.2 circlize_0.4.15 rstudioapi_0.14
## [130] foreach_1.5.2 foreign_0.8-83 gridExtra_2.3
## [133] prodlim_2019.11.13 rmdformats_1.0.4 farver_2.1.1
## [136] Rtsne_0.16 digest_0.6.29 BiocManager_1.30.19
## [139] rgeos_0.5-9 FNN_1.1.3.1 shiny_1.7.3
## [142] lava_1.7.0 Rcpp_1.0.9 broom_1.0.1
## [145] later_1.3.0 harmony_0.1.1 RcppAnnoy_0.0.20
## [148] httr_1.4.4 ComplexHeatmap_2.12.1 colorspace_2.0-3
## [151] rvest_1.0.3 fs_1.5.2 tensor_1.5
## [154] reticulate_1.26 IRanges_2.32.0 splines_4.2.2
## [157] uwot_0.1.14 spatstat.utils_3.0-1 sp_1.5-1
## [160] plotly_4.10.1 xtable_1.8-4 jsonlite_1.8.3
## [163] timeDate_4021.106 ipred_0.9-13 R6_2.5.1
## [166] Hmisc_4.7-2 pillar_1.8.1 htmltools_0.5.3
## [169] mime_0.12 BiocParallel_1.32.4 glue_1.6.2
## [172] fastmap_1.1.0 DT_0.26 BiocNeighbors_1.16.0
## [175] class_7.3-20 codetools_0.2-18 utf8_1.2.2
## [178] lattice_0.20-45 bslib_0.4.1 spatstat.sparse_3.0-0
## [181] ggbeeswarm_0.6.0 leiden_0.4.3 interp_1.1-3
## [184] survival_3.4-0 limma_3.54.0 rmarkdown_2.18
## [187] munsell_0.5.0 e1071_1.7-12 GetoptLong_1.0.5
## [190] iterators_1.0.14 haven_2.5.1 reshape2_1.4.4
## [193] gtable_0.3.1